Setup

Load ProQuest metadata (export dated 2024-02-01) for English-language sociology department dissertations covering 2020 through 2024-02-01. Clean the text and tokenise it as skip n-grams (n = 2, k = 1).

Code
# Stemming switch: when TRUE, the n-grams are stemmed with textstem after tokenisation.
DOSTEM <- FALSE
Code
# Download the three shared Google Sheets (hand-coded abstracts, topic seeds,
# stem/stop lists) from the project Drive folder into data/ as .xlsx files.
#
# force_download = TRUE refreshes the local copies on every run, which is what
# the previously hard-wired `|| TRUE` did; set it to FALSE to download only
# when one of the local files is missing.
force_download <- TRUE

# Drive file name -> local path stem (the download is saved with an ".xlsx"
# suffix appended to this stem).
drive_targets <- c(
  abstracts_hand_coding = "data/coded_abstracts",
  socabs_topic_seeds    = "data/ngram_seeds",
  stems_and_stops       = "data/stems_and_stops"
)

if (force_download || !all(file.exists(paste0(drive_targets, ".xlsx")))) {
  if (!requireNamespace("googledrive")) {
    stop("Package 'googledrive' is required to download the coding sheets.",
         call. = FALSE)
  }

  folderid <- googledrive::as_id("https://drive.google.com/drive/folders/1eFat4mCoRZX22gPLTj9w5Bn63U3q71yo")
  filels <- googledrive::drive_ls(folderid)

  for (drive_name in names(drive_targets)) {
    file_id <- filels %>% filter(name == drive_name) %>% pull(`id`)

    # Fail with a clear message instead of passing zero or several ids on.
    if (length(file_id) != 1L) {
      stop("Expected exactly one Drive file named '", drive_name,
           "' but found ", length(file_id), ".", call. = FALSE)
    }

    googledrive::drive_download(
      file = file_id,
      path = drive_targets[[drive_name]],
      type = "xlsx",
      overwrite = TRUE
    )
  }

  rm(folderid, filels, drive_name, file_id)
}

rm(force_download, drive_targets)
Loading required namespace: googledrive
! Using an auto-discovered, cached token.
  To suppress this message, modify your code or options to clearly consent to
  the use of a cached token.
  See gargle's "Non-interactive auth" vignette for more details:
  <https://gargle.r-lib.org/articles/non-interactive-auth.html>
ℹ The googledrive package is using a cached token for 'micah.altman@gmail.com'.
File downloaded:
• 'abstracts_hand_coding' <id: 1FO5AjIpe0iIgLgTaHBK8b7OLUR29JY9Z1aGxOWZkJ9U>
Saved locally as:
• 'data/coded_abstracts.xlsx'
File downloaded:
• 'socabs_topic_seeds' <id: 1l9I2kvMUFkVpmcMfm2pfcXCyU18fX4En5ptfu9qu8zo>
Saved locally as:
• 'data/ngram_seeds.xlsx'
File downloaded:
• 'stems_and_stops' <id: 1sp16X7rED97CXjHFloUOOg10A2mb3VaH3Dyfxjp-dv0>
Saved locally as:
• 'data/stems_and_stops.xlsx'
Code
# Hand-coded abstracts workbook (local copy saved under data/).
coded_abs.df <- readxl::read_excel("data/coded_abstracts.xlsx")
Code
# Read the two ProQuest exports and stack them into one metadata table.
diss_meta.df <- bind_rows(
  readxl::read_excel("data/ProQuestDocuments-2024-02-01.xls"),
  readxl::read_excel("data/ProQuestDocuments-2024-02-01.2.xls")
)

# NOTE:
# Columns in source overlap with each other -- only selected columns used
#  - all duplicates: classification, subjectClassifications, classiifcationCOdes, majorClassificationsCodes
# - subjectTerms appears to be an automated coarse recoding of the classification
# - appears author assigned, duplicate columns: identifierKeywords, subjects
#   apparently post-processed to add "GenderWatch" and "y" tags

# Split comma-separated term strings into a list column of unique, lower-cased,
# whitespace-squished terms (one character vector per input element).
# strip_codes = TRUE also removes a leading "<digits> " code from each term.
# Vectorised over x, so no rowwise() evaluation is needed.
split_terms <- function(x, strip_codes = FALSE) {
  map(str_split(x, pattern = ","), function(parts) {
    parts <- str_squish(parts)
    if (strip_codes) {
      parts <- str_replace(parts, "^[0-9]+ ", "")
    }
    unique(str_to_lower(parts))
  })
}

diss_cleaned.df <- diss_meta.df %>%
  select(isbn, Authors, classification, subjectTerms, pubdate,
         Abstract, Title, identifierKeywords) %>%
  mutate(
    # Missing term strings become "" so they split to a single empty term.
    classification = replace_na(classification, ""),
    subjectTerms = replace_na(subjectTerms, ""),
    identifierKeywords = replace_na(identifierKeywords, ""),
    # NOTE(review): assumes pubdate starts with a four-digit year -- confirm.
    pubyear_clean = year(as_date(pubdate, format = "%Y")),
    classification_clean = split_terms(classification, strip_codes = TRUE),
    subject_terms_clean = split_terms(subjectTerms),
    au_identifier_terms_clean = split_terms(identifierKeywords)
  ) %>%
  select(isbn, Authors, classification_clean, subject_terms_clean, pubyear_clean,
         Abstract, Title, identifierKeywords, au_identifier_terms_clean)

# Dissertations per publication year, as a table and as a vector named by year
# (used below as the denominator for shares).
dy.df <- diss_cleaned.df %>%
  count(pubyear_clean) %>%
  rename(year = pubyear_clean)
dy.ls <- dy.df %>% pull(n)
names(dy.ls) <- dy.df %>% pull(year)

rm(diss_meta.df, split_terms)

Author Characteristics

Code
requireNamespace("opengender")
Loading required namespace: opengender
Code
# Take the text after the first comma in Authors, keep its first
# whitespace-delimited word as the given name, then add opengender's
# predictions.
diss_cleaned_plus_gender.df <- diss_cleaned.df %>%
  mutate(
    given = Authors %>%
      str_split_i(pattern = ",", 2) %>%
      str_squish() %>%
      str_split_i(pattern = "[:space:]", 1)
  ) %>%
  opengender::add_gender_predictions()

# One-row summary of the predicted gender proportions.
summarize(
  diss_cleaned_plus_gender.df,
  opengender::gender_mean(og_details, simplify_output = "row")
)
# A tibble: 1 × 3
  prop_F prop_M prop_O
   <dbl>  <dbl>  <dbl>
1  0.591  0.409      0

# Text prep

Code
# Small stop list (a subset of the snowball stopwords) handed to the tokenizer.
# "how" is listed twice in the original list; kept as-is.
minimal_stopwords <- c(
  "a", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "but",
  "by", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down",
  "during", "each", "for", "from", "further", "had", "hadn't", "has",
  "hasn't", "have", "haven't", "having", "how", "i", "i'd", "i'll", "i'm",
  "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
  "let's", "me", "my", "myself", "of", "on", "or", "other", "so", "than",
  "that", "that's", "the", "their", "they", "them", "then", "there",
  "there's", "these", "this", "how", "to", "too", "was", "wasn't", "when",
  "when's", "where", "where's", "which", "while", "will", "with", "won't"
)

# Merge abstract, title and author keywords into one text per dissertation,
# normalise it, and tokenise into skip n-grams (n = 2, k = 1).
combined_tidy.df <- diss_cleaned.df %>%
  select(isbn, Abstract, Title, identifierKeywords) %>%
  unite(col = "Clean_combined", Abstract, Title, identifierKeywords,
        sep = " ", remove = TRUE, na.rm = TRUE) %>%
  mutate(
    # Protect "U.S." before the periods are used to drop n-grams below.
    Clean_combined = str_replace_all(Clean_combined, "U\\.S\\.", "USA"),
    # Hyphens, brackets, ampersands, quotes and slashes become spaces.
    Clean_combined = str_replace_all(Clean_combined, "[-\\)\\(\\&\"/]", " "),
    # Keep only letters, spaces, periods and apostrophes.
    Clean_combined = str_replace_all(Clean_combined, "[^a-zA-Z \\.']", ""),
    Clean_combined = str_squish(Clean_combined)
  ) %>%
  unnest_tokens(ngram, "Clean_combined", token = "skip_ngrams", n = 2, k = 1,
                stopwords = minimal_stopwords) %>%
  mutate(ngram = str_squish(ngram)) %>%
  # Drop n-grams containing a period, and empty tokens.
  filter(!str_detect(ngram, "\\."), ngram != "")

if (DOSTEM) {
  combined_tidy.df <- combined_tidy.df %>%
    mutate(ngram = textstem::stem_strings(ngram, language = "english"))
}

# Per-dissertation n-gram counts with tf-idf weights (document = isbn).
combined_tf_idf.df <- combined_tidy.df %>%
  count(ngram, isbn) %>%
  bind_tf_idf(ngram, isbn, n)
Code
# Hand-curated list of two-word n-grams (plus the "genderwatch" tag) to treat
# as stop terms; it is combined with stop_post_tokens into stop_post.df below.
stop_ngrams <- c( 
"associated with", 
"interviews with",  
"these findings", 
"with their", 
"my dissertation",
"first chapter",
"second chapter", 
"empirical chapter", 
"my research", 
"depth with", 
"my findings",
"dissertation three", 
"et al",
"third chapter", 
"three studies",
"their with", 
"with social",  
"with other", 
"interactions with",
"taken together", 
"empirical chapters",
"my study", 
"they also", 
"women who",
"who were", 
"findings study", 
"people who", 
"relationships with", 
"these results", 
"their their", 
"these studies", 
"they were", 
"attitudes toward", 
"since s", 
"these suggest", 
"three papers", 
"women with", 
"along with", 
"these two",
"with more", 
"students their",  
"their identities", 
"their health", 
"their parents", 
"were likely", 
"with these", 
"chapter two", 
"chapter uses",
"their counterparts",
"with lower",  
"structured with", 
"students with", 
"study explores", 
"these three", 
"together these", 
"health health", 
"students who",
"th century", 
"their identity",
"third study",
"also find",
"chapter three", 
"across three", 
"all three", 
"chapter examine",
"one another", 
"previous research", 
"second paper", 
"were with", 
"first paper", 
"final chapter", 
"with focus", 
"study also", 
"with mental", 
"with parents", 
"with women", 
"women their", 
"social social", 
"within context", 
"individuals with", 
"other hand", 
"other social",  
"following questions", 
"three questions", 
"individuals who", 
"third paper", 
"three distinct",
"while also", 
"with high", 
"address these", 
"children with", 
"conducted with",
"dissertation social", 
"existing literature", 
"health among", 
"results study", 
"their communities", 
"there little", 
"what call", 
"who their", 
"with family", 
"among women", 
"different social", 
"experiences their", 
"findings dissertation", 
"research also", 
"study examine", 
"their political", 
"these factors", 
"they navigate", 
"with black", 
"with data", 
"with levels", 
"with one", 
"women more",
"engagement with",
"finally chapter", 
"level factors", 
"little known", 
"men who", 
"more their", 
"with racial", 
"adults with",
"data with",
"dissertation two", 
"engage with",
"more specifically",
"study social", 
"with different", 
"with health", 
"chapter four", 
"cope with", 
"different types",
"existing research", 
"one most", 
"parents their",
"social dissertation", 
"their they", 
"these changes", 
"these processes", 
"were conducted", 
"who with", 
"with illness", 
"with people", 
"among older", 
"contributes understanding", 
"current study", 
"interact with", 
"late s",
"overall dissertation", 
"people their", 
"prior research", 
"their family", "their status", "them their", "they with", "vary across", 
"what they", "with who", "across different", 
"across united", "chapter analyzes", "data study", 
"however these", "level data", 
"research question", 
"study uses", "support their", 
"these experiences", "these were", "within their", "work with", 
 "based their",  "chapter investigates", 
 "different groups", "early s", 
  "first examine", 
"individuals their",
"methods approach", 
"months ethnographic", 
"months fieldwork", 
"their experience", 
"their relationships", 
"third empirical", 
"with political", "with students", "years ethnographic", "across all", 
"across groups", "among adults", "children their", 
"different ways", "dissertation with", "impact their", "research social", 
"shape their", "there research", "these data", "using methods", 
"were associated",  "association with", 
 "even when", "examine social", "few studies", "findings also", 
"first two", 
"relationships their", "sense their", 
"their these", "these can", "they can", 
"were less", 
"with greater", "with increased", "chapter dissertation", 
"conclude with", 
"dissertation research", "dissertation use", "questions what", 
"three different", "three dissertation", 
"two different", "with non",
"within social", "women were",  
 "implications research", "mid s", "positively with", 
 "st century", "study chapter", "study three", 
 "these analyses",  "were used", "what extent", "while research", 
"with implications", "with low", "with self", 
"work their",  "other words",  "research with", 
 "their role", "these also", 
"these dissertation", "two chapters", 
"were significantly", "when their",  "with men", 
"with new", "with respect",  
"due their",  "first dissertation",  
"however there", 
"most important", "my first", "my suggest", "social well", "study two", 
"themselves their", "these groups", "these strategies", "three key", 
 "what can",  
"can help",  "consists three", 
 "dissertation also", "experience their", 
"explore these",
"we find", "were also",  "with discussion", 
"also their",  "combined with", "compared their", 
"contact with", "data depth", "data three", "dissertation study", 
"finally study",  
"findings research",
 "methods used", "more more", "more with", 
"other forms", "overall findings", 
"overall study",  "research focused", 
"results these", "school with",  "study with",  
"these chapters",  "three years", 
 "using case", "work conflict",
 "chapter whether", 
 "correlated with", "data dissertation", 
"data were", "despite their",  "however research", 
"men their", "my results",  "other factors", 
 "present study", "research these", "research what", 
"states with", "structural cultural", "study adult", "their everyday", 
 "these provide", "these spaces", 
"three research", "use their", 
"also more", "also show",  "concludes with", 
"examines social",    
"level characteristics", "much more", "my also", 
"my show", "my work", 
"outcomes with", "taken these", 
"their first", "their interactions", "them with", 
 "these differences", "these issues", "these outcomes", 
"these research", "they did", "they face",
"genderwatch"
)

# Single-token stop terms (plus "et al") for post-tokenisation filtering.
stop_post_tokens <- c(
  "with", "", "social", "these", "dissertation", "study", "research", "more",
  "they", "chapter", "also", "three", "who", "first", "were", "findings",
  "analysis", "use", "while", "within", "two", "my", "other", "using",
  "across", "among", "can", "level", "based", "find", "well", "what", "time",
  "second", "when", "studies", "one", "may", "care", "however", "non", "them",
  "all", "there", "used", "third", "including", "s", "four", "upon", "n", "y",
  "co", "xa", "five", "re", "al", "et", "et al"
)

# Full and minimal stop tables, keyed on `ngram`.
stop_post.df <- tibble(ngram = c(stop_post_tokens, stop_ngrams))
stop_post_minimal.df <- tibble(ngram = "with")

# NOTE(review): this is a plain copy -- neither stop table is applied here.
# Confirm whether an anti_join against stop_post.df was intended.
combined_tf_idf_stemmed.df <- combined_tf_idf.df

rm(stop_post_tokens, stop_ngrams)

Distributions of topics (unclustered)

Dissertations by Controlled Classifications

Code
# Long table: one row per dissertation x controlled classification term.
diss_class_tidy.df <- diss_cleaned.df %>%
  select(isbn, term = classification_clean, pubyear_clean) %>%
  unnest(cols = c(term))

# Denominator (all dissertations) and the band of shares that is plotted.
ndis <- sum(dy.ls)
lower_q <- .05
upper_q <- .8

# Horizontal bars: share of dissertations per classification within the band.
# NOTE(review): labs(x = ...) labels the term aesthetic, not the share axis.
plotly::ggplotly(
  diss_class_tidy.df %>%
    count(term, sort = TRUE) %>%
    mutate(p = n / ndis) %>%
    filter(p >= lower_q & p <= upper_q) %>%
    ggplot(aes(x = fct_reorder(term, p), y = p)) +
    geom_col() +
    coord_flip() +
    labs(x = "%age of dissertation assigned to controlled classifications")
)
Code
# Same view split by publication year (years before 2024 only): p is the share
# of all dissertations, p_year the share within that year's dissertations.
plotly::ggplotly(
  diss_class_tidy.df %>%
    filter(pubyear_clean < 2024) %>%
    count(term, pubyear_clean, sort = TRUE) %>%
    rowwise() %>%
    mutate(
      p = n / ndis,
      p_year = n / dy.ls[[as.character(pubyear_clean)]]
    ) %>%
    filter(p_year >= lower_q & p_year <= upper_q) %>%
    ggplot(aes(x = fct_reorder(term, p), y = p_year)) +
    geom_col() +
    coord_flip() +
    labs(x = "%age dissertations in assigned  classification over time") +
    facet_wrap(vars(pubyear_clean))
)
Code
# Drop this section's temporaries (ndis is kept).
rm(diss_class_tidy.df, lower_q, upper_q)

Dissertations by Author-Assigned Topics

Code
# Tags to ignore among the author-assigned keywords.
stop_topics.df <- tibble(term = c("y", "genderwatch"))

# Long table: one distinct row per dissertation x author-assigned keyword.
diss_au_id_tidy.df <- diss_cleaned.df %>%
  select(isbn, term = au_identifier_terms_clean, pubyear_clean) %>%
  unnest(cols = c(term)) %>%
  anti_join(stop_topics.df, by = "term") %>%
  distinct()

lower_q <- .025
upper_q <- 1

# Horizontal bars: share of dissertations per keyword within the band.
plotly::ggplotly(
  diss_au_id_tidy.df %>%
    count(term, sort = TRUE) %>%
    mutate(p = n / sum(dy.ls)) %>%
    filter(p >= lower_q & p <= upper_q) %>%
    ggplot(aes(x = fct_reorder(term, p), y = p)) +
    geom_col() +
    coord_flip() +
    labs(x = "%age of dissertation by author-assigned keywords")
)
Code
# Drop this section's temporaries (lower_q / upper_q are reassigned below).
rm(diss_au_id_tidy.df, stop_topics.df)

Dissertations by Terminology

Code
# Document frequency per n-gram: the input has one row per (ngram, isbn), so
# the row count is the number of dissertations containing the n-gram.
unc_doc_freq.df <- combined_tf_idf_stemmed.df %>%
  count(ngram) %>%
  mutate(p = n / sum(dy.ls))

lower_q <- .25
upper_q <- .75

# Word cloud of the 200 n-grams found in the most dissertations.
ggwordcloud::ggwordcloud2(
  unc_doc_freq.df %>%
    slice_max(order_by = n, n = 200) %>%
    rename(word = ngram, freq = n),
  size = .8
) +
  labs(title = "terms appearing in most dissertations - excluding stopwords")

Code
# Horizontal bars for n-grams whose document share lies in [lower_q, upper_q].
plotly::ggplotly(
  unc_doc_freq.df %>%
    mutate(
      p = n / sum(dy.ls),
      ngram = fct_reorder(ngram, p)
    ) %>%
    filter(p >= lower_q & p <= upper_q) %>%
    ggplot(aes(x = ngram, y = p)) +
    geom_col() +
    coord_flip() +
    labs(x = "terms appearing in [25%-75%] of dissertations (excluding stopwords, min 1%)")
)
Code
# Interactive table (with CSV export) of the 1000 most widespread n-grams.
DT::datatable(
  data = slice_max(unc_doc_freq.df, order_by = n, n = 1000),
  extensions = "Buttons",
  options = list(
    dom = "Bfrltip",
    buttons = c("csv")
  ),
  caption = "ngrams appearing in most dissertations"
)
Code
# Drop this section's temporaries.
rm(unc_doc_freq.df, lower_q, upper_q)

Distribution of Terminology Across Corpus

Code
# Corpus-wide frequency per n-gram: per-dissertation counts summed, ascending.
unc_ngram_freq.df <- combined_tf_idf_stemmed.df %>%
  count(ngram, wt = n) %>%
  arrange(n)

# Word cloud of the 200 most frequent n-grams.
ggwordcloud::ggwordcloud2(
  unc_ngram_freq.df %>%
    slice_max(order_by = n, n = 200) %>%
    rename(word = ngram, freq = n),
  size = .8
) +
  labs(title = "most popular uncontrolled terms - excluding stop words")

Code
# Interactive table (with CSV export) of the 1000 most frequent n-grams.
DT::datatable(
  data = slice_max(unc_ngram_freq.df, order_by = n, n = 1000),
  extensions = "Buttons",
  options = list(
    dom = "Bfrltip",
    buttons = c("csv")
  ),
  caption = "Most frequent 1 and 2 word terms in uncontrolled description"
)
Code
# Drop this section's temporary.
rm(unc_ngram_freq.df)

Most Distinctive Terms in Each Dissertation

Code
nterms <- 5    # top tf-idf terms kept per dissertation
ndiss <- 100   # dissertations shown in the table

# Highest tf-idf terms of each dissertation (result stays grouped by isbn).
topterms.df <- combined_tf_idf_stemmed.df %>%
  group_by(isbn) %>%
  slice_max(order_by = tf_idf, n = nterms)

# Dissertations ranked by the mean tf-idf of those top terms.
topdis.df <- topterms.df %>%
  group_by(isbn) %>%
  summarize(mean_tf_idf = mean(tf_idf)) %>%
  slice_max(order_by = mean_tf_idf, n = ndiss)

# One row per selected dissertation: its top terms as one string, plus title.
distinctive_diss.df <- topdis.df %>%
  select(isbn) %>%
  left_join(topterms.df %>% select(isbn, ngram), by = "isbn") %>%
  group_by(isbn) %>%
  summarize(distinct_terms = paste(ngram, collapse = ", ")) %>%
  left_join(diss_cleaned.df %>% select(Title, isbn), by = "isbn")

DT::datatable(
  data = distinctive_diss.df %>% select(Title, distinct_terms),
  extensions = "Buttons",
  options = list(
    dom = "Bfrltip",
    buttons = c("csv")
  ),
  caption = "Most distinctive terms in dissertations with distinctive terms"
)
Code
# Drop this section's temporaries.
rm(nterms,ndiss, topterms.df, topdis.df, distinctive_diss.df)

Topic Coding

Hand Coded Abstracts

Code
# Accumulator for topic codings from each source; chunks below append to it with bind_rows().
coded_isbns.df <- NULL
Code
# Hand-coded topics: keep isbn, every column whose name contains ':', and
# EpiStyle_type.
# NOTE(review): if_any(everything(), ...) also tests isbn, so a row is only
# dropped when isbn is NA too -- confirm whether isbn should be excluded.
hand_codes_tmp.df <-
  coded_abs.df %>% select(isbn,contains(':'),EpiStyle_type) %>% 
  filter( if_any(everything(), ~ !is.na(.x)) )

# Spread EpiStyle_type into one "epi:<style>" column per lower-cased value,
# drop the column created for missing styles, then turn every non-isbn column
# into a logical flag (TRUE where a value was present).
hand_codes_tmp.df %<>% 
  mutate(EpiStyle_type = str_to_lower(EpiStyle_type)) %>%
  #select(isbn,EpiStyle_type) %>%
  pivot_wider(names_from=EpiStyle_type, values_from=EpiStyle_type,
              names_prefix="epi:") %>% 
  select(-"epi:NA") %>%
  mutate(across(!isbn, 
                  ~ case_when(is.na(.x) ~ FALSE,  .default = TRUE)))

# Long format: one row per (isbn, topic) with its flag, tagged as hand-coded.
hand_codes_tmp.df %<>%
  pivot_longer(!isbn) %>%
  rename(topic=name) %>%
  mutate(coding_src="hand")

coded_isbns.df %<>% 
  bind_rows(hand_codes_tmp.df)
 
# Number of hand-coded dissertations per topic, split into dimension:category.
# NOTE(review): select(-isbn,coding_src) drops isbn only; coding_src is kept.
hand_code_tot.df <-
   hand_codes_tmp.df %>% 
    select(-isbn,coding_src) %>%
    filter(value) %>%
    count(topic,value) %>%
    select(-value) %>%
    separate_wider_delim(topic,
                         names=c("dimension","category"),delim=":")

# Dodged bars of counts per category, one facet per dimension.
{hand_code_tot.df %>%
    mutate(dimension = as.factor(dimension),
           category=as.factor(category)) %>%
    ggplot(aes(y=n,x=dimension,  fill=category,label=category)) +
    geom_col(position=position_dodge2()) +
    geom_text(position=position_dodge2(width=1), vjust=1) +
    theme(legend.position="none") + 
    facet_wrap(vars(dimension), scales="free_x")} %>%
  plotly::ggplotly()
Code
# Drop this section's temporaries (coded_isbns.df is kept).
rm(hand_codes_tmp.df, hand_code_tot.df) 

Exact Match to Coded Topics Terms

Code
# Seed terms entered in the hand-coding sheet: each ':'-named column is a
# topic and its cells hold ';'-separated terms (a bare 'x' is not a term).
seeds_from_codes.df <- coded_abs.df %>%
  select(contains(":")) %>%
  pivot_longer(cols = everything(), names_to = "topic", values_to = "ngram") %>%
  filter(!is.na(ngram), ngram != "x") %>%
  separate_longer_delim(ngram, ";") %>%
  mutate(ngram = str_squish(ngram)) %>%
  filter(ngram != "")

# Seed terms from the n-gram seed workbook: a row may list several
# ';'-separated topics, and only "dimension:category" entries are kept.
seeds_from_ngrams.df <-
  suppressMessages(readxl::read_excel("data/ngram_seeds.xlsx")) %>%
  select(topic = Topic, ngram = Ngram) %>%
  separate_longer_delim(topic, ";") %>%
  filter(!is.na(topic), str_detect(topic, ":")) %>%
  mutate(topic = str_squish(topic)) %>%
  filter(topic != "")

# Combined seed list: lower-cased, hyphens as spaces, whitespace squished.
seeds.df <- bind_rows(seeds_from_codes.df, seeds_from_ngrams.df) %>%
  mutate(
    ngram = str_squish(
      str_replace_all(str_to_lower(ngram), pattern = "-", " ")
    )
  ) %>%
  distinct()

rm(seeds_from_codes.df, seeds_from_ngrams.df)
Code
# Pair every seed term with the dissertations whose n-grams contain it exactly.
# inner_join keeps matched terms only. A left_join would keep unmatched seed
# terms as rows with isbn = NA, and each would then be counted as one matching
# dissertation in the per-topic totals derived from this table.
exact_matches.df <-
  inner_join(seeds.df,
             combined_tf_idf_stemmed.df %>% distinct(),
             relationship = "many-to-many",
             by = "ngram"
             )

# The 15 (topic, term) pairs matched in the most dissertations; per_dis is the
# share of all dissertations.
exact_matches.df %>%
  select(topic, ngram, isbn) %>%
  distinct() %>%
  group_by(topic, ngram) %>%
  summarize(n = n(), per_dis = n() / sum(dy.ls), .groups = "drop") %>%
  arrange(desc(per_dis)) %>%
  slice_head(n = 15) %>%
  gt::gt() %>%
  gt::tab_header(title = "most frequently matched topic terms")
most frequently matched topic terms
topic ngram n per_dis
meth:empirical interviews 686 0.3664530
meth:qual interviews 686 0.3664530
claim:desc examine 542 0.2895299
claim:desc examines 471 0.2516026
subj:health health 455 0.2430556
subj:gender gender 438 0.2339744
scope:us united states 435 0.2323718
subj:econ economic 405 0.2163462
subj:race race 404 0.2158120
meth:theory theory 377 0.2013889
subj:race racial 345 0.1842949
meth:qual qualitative 343 0.1832265
subj:education education 328 0.1752137
subj:inequality inequality 321 0.1714744
claim:desc explore 318 0.1698718
Code
# Number of distinct isbn values with at least one seed-term match.
nmatched_dis <- length(unique(exact_matches.df$isbn))

# Reduce to one row per (topic, dissertation), tagged as an exact-match coding.
exact_matches.df <- exact_matches.df %>%
  select(topic, isbn) %>%
  distinct() %>%
  mutate(value = TRUE, coding_src = "exact")

# Dissertations per topic, split into dimension/category, with the share of
# all dissertations.
topic_sum.df <- exact_matches.df %>%
  count(topic) %>%
  separate_wider_delim(topic, names = c("dimension", "category"), delim = ":") %>%
  mutate(p = n / sum(dy.ls))

lower_q <- .15
coded_isbns.df <- bind_rows(coded_isbns.df, exact_matches.df)

# Dodged bars of topic shares, one facet per dimension.
plotly::ggplotly(
  topic_sum.df %>%
    ggplot(aes(y = p, x = dimension, fill = category, label = category)) +
    geom_col(position = position_dodge2(width = 1)) +
    geom_text(position = position_dodge2(width = 1), vjust = 1) +
    theme(legend.position = "none") +
    facet_wrap(vars(dimension), scales = "free_x") +
    labs(x = "percent of dissertations that contain topic-seed terms")
)
Code
# Same chart restricted to topics whose share exceeds lower_q.
plotly::ggplotly(
  topic_sum.df %>%
    filter(p > lower_q) %>%
    ggplot(aes(y = p, x = dimension, fill = category, label = category)) +
    geom_col(position = position_dodge2(width = 1)) +
    geom_text(position = position_dodge2(width = 1), vjust = 1) +
    theme(legend.position = "none") +
    facet_wrap(vars(dimension), scales = "free_x") +
    labs(x = "percent of dissertations that contain topic-seed terms, excluding rare topics")
)
Code
# Table of topic counts and shares, row-grouped by dimension, p as percent.
gt::fmt_percent(
  gt::gt(group_by(topic_sum.df, dimension)),
  columns = "p"
)
category n p
claim
desc 1538 82.16%
mod 801 42.79%
strong 321 17.15%
weak 374 19.98%
epi
activist 108 5.77%
constructive 31 1.66%
pos 30 1.60%
util 92 4.91%
meth
empirical 1562 83.44%
longitudinal 241 12.87%
qual 1104 58.97%
quant 843 45.03%
theory 668 35.68%
scope
us 951 50.80%
world 366 19.55%
subj
class 450 24.04%
crim 418 22.33%
econ 895 47.81%
education 558 29.81%
environment 317 16.93%
ethnicity 290 15.49%
family 583 31.14%
gender 553 29.54%
health 646 34.51%
identity 388 20.73%
immigration 271 14.48%
inequality 1080 57.69%
lifecourse 202 10.79%
media 238 12.71%
movements 263 14.05%
networks 257 13.73%
orgs 557 29.75%
politics 607 32.43%
race 700 37.39%
religion 118 6.30%
rural 100 5.34%
sexuality 185 9.88%
stigma 90 4.81%
urban 280 14.96%
violence 200 10.68%
youth 178 9.51%
Code
# Drop this section's temporaries (topic_sum.df and seeds.df are kept).
rm(lower_q,exact_matches.df)

Topic Models

Code
requireNamespace("quanteda")
Loading required namespace: quanteda
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"ndiMatrix" of class "replValueSp"; definition not updated
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"pcorMatrix" of class "replValueSp"; definition not updated
Code
requireNamespace("keyATM")
Loading required namespace: keyATM
Code
# Settings for the keyword-seeded topic model.
topic_lower_q <- .15        # topics below this document share are not seeded
unseeded_topics <- 3        # passed to keyATM as no_keyword_topics
seed_prior_strength <- .95  # passed to keyATM as priors$beta_s
n_subj_categories <- 5      # number of most common "subj" topics kept as seeds
lower_q <- topic_lower_q    # document-share band for the model vocabulary
upper_q <- .9
exclude_dim <- c("claim")   # dimensions never used as seeded topics

# Document frequency and share for every n-gram.
ngram_diss_ct.df <- combined_tf_idf_stemmed.df %>%
  count(ngram) %>%
  mutate(p = n / sum(dy.ls))

# Model vocabulary: n-grams inside the share band plus all seed terms, with
# their per-dissertation counts. Seed terms absent from the corpus get NA
# after the join and are removed by na.omit().
core_ngrams.df <- ngram_diss_ct.df %>%
  filter(p >= lower_q, p <= upper_q) %>%
  select("ngram") %>%
  bind_rows(seeds.df %>% select(ngram)) %>%
  distinct() %>%
  # Explicit key: avoids the "Joining with ..." message and keeps the join
  # stable if either table later gains another shared column name.
  left_join(combined_tf_idf_stemmed.df %>% select(ngram, isbn, n),
            by = "ngram") %>%
  na.omit() %>%
  distinct()
Joining with `by = join_by(ngram)`
Code
# Document-feature matrix: documents are isbn values, features are n-grams,
# cells are per-dissertation counts.
core_dfm <- cast_dfm(core_ngrams.df, document = isbn, term = ngram, value = n)

# Convert the dfm into keyATM's document format.
keyATM_docs <- keyATM::keyATM_read(texts = core_dfm)
ℹ Using quanteda dfm.
Code
# Topics matched in too small a share of dissertations.
excluded_topics <- topic_sum.df %>%
  filter(p < topic_lower_q) %>%
  mutate(topic = paste(dimension, category, sep = ":"), .keep = "none")

# Every topic in an excluded dimension.
excluded_areas <- topic_sum.df %>%
  filter(dimension %in% exclude_dim) %>%
  mutate(topic = paste(dimension, category, sep = ":"), .keep = "none")

# All "subj" topics except the n_subj_categories most common ones.
excluded_subjects <- topic_sum.df %>%
  filter(dimension == "subj") %>%
  arrange(desc(p)) %>%
  slice_tail(n = -n_subj_categories) %>%
  mutate(topic = paste(dimension, category, sep = ":"), .keep = "none")

# Named list for keyATM: topic -> character vector of its seed n-grams.
seeded_topics.ls <- seeds.df %>%
  anti_join(
    bind_rows(excluded_topics, excluded_subjects, excluded_areas),
    by = "topic"
  ) %>%
  group_by(topic) %>%
  summarise(ngram_list = list(ngram)) %>%
  pmap(function(topic, ngram_list) setNames(list(ngram_list), topic)) %>%
  list_flatten()

key_viz <- keyATM::visualize_keywords(
  docs = keyATM_docs,
  keywords = seeded_topics.ls
)
Warning: Keywords are pruned because they do not appear in the documents: semi
structured interviews, in depth interviews, snowball sample, qualitative case
study, ethography, qualitative methodological approach, extended case logic,
event history analysis, american time use survey, their mental, with
disabilities, critical race theory, doing gender, shares of housework, low wage
jobs, labor market disadvantages, cumulative inequality theory, sex based
discrimination, …, displacement of workers, and their racial
Code
# Print the keyword visualisation built by keyATM::visualize_keywords().
key_viz
Warning: Removed 201 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 201 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 201 rows containing missing values or values outside the scale range
(`geom_label_repel()`).
Warning: ggrepel: 507 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Code
# Fit the base keyATM model with the seeded topics plus `unseeded_topics`
# keyword-free topics; messages from the fit are suppressed.
# NOTE(review): no seed is set in this call, so fits may differ between runs.
lda_key <- suppressMessages(
  keyATM::keyATM(
    docs = keyATM_docs,
    no_keyword_topics = unseeded_topics,
    keywords = seeded_topics.ls,
    priors = list(beta_s = seed_prior_strength),
    model = "base"
  )
)
Warning: Keywords are pruned because they do not appear in the documents: semi
structured interviews, in depth interviews, snowball sample, qualitative case
study, ethography, qualitative methodological approach, extended case logic,
event history analysis, american time use survey, their mental, with
disabilities, critical race theory, doing gender, shares of housework, low wage
jobs, labor market disadvantages, cumulative inequality theory, sex based
discrimination, …, displacement of workers, and their racial
Code
# Model-fit plot for the fitted model.
lda_key %>% keyATM::plot_modelfit()

Code
# Table of the top words in each fitted topic.
gt::tab_header(
  gt::gt(keyATM::top_words(lda_key)),
  "Top words for each topic"
)
Top words for each topic
1_meth:empirical 2_meth:qual 3_meth:quant 4_meth:theory 5_scope:us 6_scope:world 7_subj:econ 8_subj:health 9_subj:inequality 10_subj:politics 11_subj:race Other_1 Other_2 Other_3
social social study neighborhood states migration work health [✓] gender political black [✓] violence police students
chapter study between urban united immigrant labor [✓] mental [✓] women state racial [✓] criminal sexual school
data research were neighborhoods united states [✓] immigrants economic [✓] mental health [✓] inequality [✓] social race [✓] incarceration law education
dissertation interviews [✓] factors social policy world [✓] workers life family justice [✓] white [✓] prison violence college
between through more theory [✓] state migrants employment [✓] social sexual politics [✓] ethnic punishment policing schools
more community results online religious migrant economy [✓] medical [✓] gendered movements american incarcerated legal parents
research about data sustainability local [✓] ethnic management [✓] healthcare [✓] discrimination [✓] media racism [✓] criminal justice transgender educational
study not effects gentrification [9] immigration status job [✓] family rural change racialized [✓] youth gender student
three who not environmental usa [✓] refugees dissertation support inequalities [✓] environmental ethnicity young adults people children
first dissertation among support public immigration family covid [✓] class public indigenous [✓] adulthood law enforcement higher
Code
# keyATM::top_docs() reports document positions within the corpus the model
# was fitted on (the documents of core_dfm), not row numbers of
# diss_cleaned.df. Translate each position to its isbn through the dfm's
# document names, then look the title up by isbn.
# NOTE(review): assumes keyATM_read() kept core_dfm's documents and order.
doc_isbns <- quanteda::docnames(core_dfm)

# isbn -> title lookup. The key is a character to match the dfm document
# names, with one row per isbn so the join cannot duplicate rows.
title_lookup.df <- diss_cleaned.df %>%
  transmute(isbn = as.character(isbn), title = Title) %>%
  distinct(isbn, .keep_all = TRUE)

lda_key %>% keyATM::top_docs(n = 5) %>%
  pivot_longer(cols = everything()) %>%
  mutate(isbn = doc_isbns[value]) %>%
  left_join(title_lookup.df, by = "isbn") %>%
  select(name, title) %>%
  group_by(name) %>%
  gt::gt() %>%
  gt::tab_header("Top dissertations in each topic")

rm(doc_isbns, title_lookup.df)
Top dissertations in each topic
title
1_meth:empirical
Neutralizing Title IX: Hyperlegal Consciousness on College Campuses in the Age of #MeToo
The Moderating Role of State-Level Policy on Individual-Level Social Inequalities in Mental Health: A Cross-National Perspective
The Biopolitics of Third Gender Category in India
A Re-Evaluation of the Hyper-Selectivity Perspective: The Case of Second-Generation Filipinos
A “Stupid Little Fish”: Science, Law and the Politics of Environmental Decline in California
2_meth:qual
Where We Die: County-Level Disparities and Constrained Choice at the End of Life
Journeying: Black Girlhood, Movement and the Refusal of (In)Justice
A Melting Fossil: Constituting Planetary Time in the Cryosphere, 1815-1980
Humanizing Bureaucracy: Exploring the Experiences of Public Sector Managers Directed to Enact Transformational Leadership in a State Agency
Open Pharma: Collective Action to Common Pharmaceutical Knowledge
3_meth:quant
Labor Control and the Experience of Work in the Platform Economy
Who Pays Income Inequality’s Health Tax? Toward a Conditional Model of Economic Stratification and Population Health
Raising Black Excellence: An Exploration of How a Black Operated Seventh-Day Adventist School Empowers Black Student Achievement and Development through Liberation and Resistance
Access to Clean Water: A Cross-National Analysis 1990–2012
Mobility Ideologies: Precarity and Meaning-Making in the College-for-All Generation
4_meth:theory
Sexuality Stratification in Contemporary Japan: A Study in Sociology
Compulsory Categories: How Asexuality Disrupts Normative Assumptions About Sexuality and Gender
Building Alternative Food Economies for Justice and Sustainability: Organic and Agroecological Agriculture in Argentina
Measuring and Modeling Cultural Meaning in Language
“Use Your Words”: A Conversation Analytic Perspective on Deaf or Hard-of-Hearing Children’s Socialization into Oral Communication
5_scope:us
Decolonizing Healthcare: A Black Feminist Analysis of Sisters Informing Sisters on Topics of AIDS (SISTA)
Narrative Meaning Productions of Compassionate Healthcare: An Examination of Cultural Codes, Organizational Practices, and Everyday Realities
Pressured Teachers, Sanctioned Students: The Persistence of Behavior Management Systems in Elementary School Classrooms
Arrested (During) Development: County Young Adult Arrest Rate Patterning Across Space and Time
A Comparative Program Evaluation: PA ACT101 and a Summer Bridge Program
6_scope:world
The Social Pattern and Causes of Dementia Prevalence Decline in the United States
Lives in Limbo: Invisibility, Control, and Temporary Migrant Workers in the Global City
Performance of Cosmopolitanism: Temporary Migrants and Their Sense of Belonging in Dubai
How States Remember Their Past: Conflicting Representations of Political Violence in Italy, 1969-2020
Return of the Yellow Peril: The Impact of the COVID-19 Pandemic on Americans' Attitudes Toward Asians and Asian Americans in the United States
7_subj:econ
Indigenous Mexicans in New York City: Immigrant Integration, Language Use, and Identity Formation
Trajectories of Emotional Well-Being among People with Advanced Cancer: Examining Gender Differences and the Roles of Social Support and Coping Styles
LGB Rights and Resilience: An Analysis of LGB Mental Health and Healthcare
Institutions in Childhood and the Transition to Adulthood: Consequences of Criminal Justice and Child Welfare System Contact in the United States
(Re)Defining Blackness: Race, Ethnicity and the Children of African Immigrants
8_subj:health
The Hands that Feed US: Endemic Precarity and Pandemic Resistance Among Migrant Food Processing Workers
Who is in the Family?: Grandparents, Stepkin and Reproduction of Social Inequality Across Generations
Embedded in Menstrual Capitalism: Gendered Discourse and the Legal (Un)Consciousness of the Menstrual Tax Repeal Movement
Academic Library Staff's Perceptions and Lived Experiences with Librarian to Staff Incivility
Opposition behind Bars: Incarcerated Black Working-Class Women and the Tradition of Resistance in the United States, 1970–2011
9_subj:inequality
Getting Rich or Getting By? Owner-Occupant Landlords in Segregated Chicago
Crossings: <em>Borderizing </em>and <em>Borderized </em>Mobilities in an Era of Converging Crises
From Millennial Sleaze to Old World Charm: The Politics of Place Branding in Creative City Singapore
Inequality of Suicide in South Korea: Unequal Distribution of Completed Suicide and Suicidal Ideation
Three Papers on Intergenerational Relationships in China
10_subj:politics
Interpersonal Discrimination and Older Latinx Adults in the United States
“<em>Everybody Is Locked Up</em>”...Black Families with Incarcerated Loved Ones
Integrating Graph and Language Data in Machine Learning Models: Applications for Computational Social Science
A Geopolitical Analysis of Innovation for Middle-Income Escape: Interpreting China's Technology Strategy
An Auto/Ethnographic Exploration of the Effects of a Female Reservist’s Deployment on a Service Member’s Support System: The Theory of Deployment Communication and Resilience for a Support System
11_subj:race
Social Capital and Health in East Asia and China: a Conceptual and Empirical Analysis
The Impact of Fixed-Term Versus Traditional Employment Contracts on Employees in College and University Housing and Residence Life Organizations
Locked Out: Women’s Housing Insecurity in a Hostile Social Environment
The Matrix of Care: Medical Decision-Making and Hospital-Based Childbirth
Invisibly Inked: An Intersectional Analysis of Tattooed Female Arrest Patterns
Other_1
The Model Minority Myth and the Mental Well-Being of Academically Struggling Asian Americans
Experiences of a Conditional and Punitive Social Safety Net: The Short- and Long-Term Impacts on Family Wellbeing
Exploring the Cybercrime Capacity and Capability of Local Law Enforcement Agencies in the United States
Breaking Barriers: Gender, Empowerment, and Women's Mixed Martial Arts
Should I Stay or Should I Go? Mexican Return Migration across the Life Course
Other_2
Finding the Meaning of Naloxone: Perceptions of the Administration of an Opioid Antagonist Drug among Police Officers
Global Networks and City Development, 1993-2020
The Rise, Endurance, and Fall of Migrant Camps on the U.S.-Mexico Border: A Sociology of Border Violence
Off-Time Illness: when Young Adults Get Illnesses Associated with Old Age
Poverty in the United States: Examining Prevalences, Penalties, and Ethno-Racial Differences, 1993-2016
Other_3
Watch Momma Work: Black Women Navigate Motherhood, Employment, and Education
“Eating Clean”: Negotiations of Power, Politics, and Knowledge Within Alternative &amp; Fringe Health Movements in the US
The Effect of Migration Status on Children’s Academic Performance in China
Food-as-Medicine: An Everyday Strategy of Health
No Trespass Zones: Governing the Mobility of Citizens and Migrants in the Post-Civil Rights Era
Code
# Draw the topic-proportion plot for `lda_key` (presumably the fitted keyATM
# model -- it is defined outside this chunk), showing 5 top words per topic and
# ordering topics by proportion. The "figure" element of the returned object is
# then converted to an interactive plotly widget.
# show_topic and label_topic are passed as NULL explicitly.
plotly::ggplotly(
  keyATM::plot_topicprop(
    lda_key,
    n = 5,
    show_topic = NULL,
    show_topwords = TRUE,
    label_topic = NULL,
    order = "proportion"
  )[["figure"]]
)

Bivariate analysis

Code
# Reshape the long coding table into a wide indicator table with one logical
# column per topic. Before pivoting: drop the coding source, keep only rows
# whose `value` is TRUE, and remove exact duplicate rows. Topics that are
# absent for a given row are filled with FALSE.
coded_isbns_wide.df <- pivot_wider(
  coded_isbns.df %>%
    select(-coding_src) %>%
    filter(value) %>%
    distinct(),
  names_from = topic,
  values_from = value,
  values_fill = FALSE
)

# Attach the topic indicators to the cleaned dissertation records by ISBN.
# This is a left join, so every dissertation row is kept; a dissertation with
# no match in the coded table gets NA (not FALSE) in the topic columns.
diss_coded_complete.df <- left_join(
  diss_cleaned_plus_gender.df,
  coded_isbns_wide.df,
  by = "isbn"
)
Code
# Cross-tabulate the `meth:qual` and `subj:inequality` indicators: one row per
# combination with its count `n`. The result remains grouped by `meth:qual`.
count(
  group_by(diss_coded_complete.df, `meth:qual`),
  `subj:inequality`
)
# A tibble: 4 × 3
# Groups:   meth:qual [2]
  `meth:qual` `subj:inequality`     n
  <lgl>       <lgl>             <int>
1 FALSE       FALSE               355
2 FALSE       TRUE                412
3 TRUE        FALSE               438
4 TRUE        TRUE                667
Code
# Mean of `og_pr_F` (presumably the estimated probability that the author is
# female -- confirm where the column is created) for each level of the
# `meth:qual` indicator. Missing values are ignored.
summarize(
  group_by(diss_coded_complete.df, `meth:qual`),
  pr_female = mean(og_pr_F, na.rm = TRUE)
)
# A tibble: 2 × 2
  `meth:qual` pr_female
  <lgl>           <dbl>
1 FALSE           0.538
2 TRUE            0.605
Code
# Mean of `og_pr_F` (presumably the estimated probability that the author is
# female -- confirm where the column is created) for each level of the
# `meth:quant` indicator. Missing values are ignored.
summarize(
  group_by(diss_coded_complete.df, `meth:quant`),
  pr_female = mean(og_pr_F, na.rm = TRUE)
)
# A tibble: 2 × 2
  `meth:quant` pr_female
  <lgl>            <dbl>
1 FALSE            0.576
2 TRUE             0.582